import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from lime.lime_tabular import LimeTabularExplainer
import seaborn as sns
import matplotlib.pyplot as plt
# Jupyter shell escape: extract the Kaggle Titanic archive (train/test/submission CSVs) into the CWD.
!unzip -o titanic.zip
Archive: titanic.zip inflating: gender_submission.csv inflating: test.csv inflating: train.csv
# Load the training data; PassengerId comes back as a regular column after reset_index.
df = pd.read_csv('./train.csv', index_col='PassengerId')
df.reset_index(inplace=True)
# Encode sex as a binary flag: 1 = female, 0 = male.
df['Sex'] = (df['Sex'] == 'female').astype(int)
# shuffle=False -> deterministic split without needing a random seed.
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)
continuous_columns = ['Age', 'SibSp', 'Parch', 'Fare']  # numeric features (fixed typo: "continous")
categorical_columns = ['Pclass', 'Sex']
model_columns = categorical_columns + continuous_columns
X_train, y_train = df_train[model_columns], df_train['Survived']
X_test, y_test = df_test[model_columns], df_test['Survived']
# Replace NaNs (Age has missing values) with a constant 0 before modelling.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
preprocess = make_pipeline(imputer)
lgbm = LGBMClassifier(max_depth=4)
model = make_pipeline(preprocess, lgbm)
# Tell LightGBM which feature positions are categorical (routed through the pipeline step name).
model.fit(X_train, y_train,
          lgbmclassifier__categorical_feature=[model_columns.index(col)
                                               for col in categorical_columns])
/home/azapala/anaconda3/envs/main/lib/python3.8/site-packages/lightgbm/basic.py:1551: UserWarning: Using categorical_feature in Dataset.
warnings.warn('Using categorical_feature in Dataset.')
Pipeline(steps=[('pipeline',
Pipeline(steps=[('simpleimputer',
SimpleImputer(fill_value=0,
strategy='constant'))])),
('lgbmclassifier', LGBMClassifier(max_depth=4))])
# Positive-class probabilities on both splits; compare AUCs to gauge overfitting.
train_preds = model.predict_proba(X_train)[:, 1]
test_preds = model.predict_proba(X_test)[:, 1]
for label, truth, preds in [("train", y_train, train_preds),
                            ("test", y_test, test_preds)]:
    print(f"Roc auc {label}:", roc_auc_score(truth, preds))
Roc auc train: 0.9522220932931075 Roc auc test: 0.9042119565217391
# Positional indices of the observations we want to explain with LIME.
passanger_index = [60, 250, 512, 300, 100]
# .copy() detaches the slice from X_train, so the column assignment below
# no longer triggers pandas' SettingWithCopyWarning.
df_selected = X_train.iloc[passanger_index, :].copy()
df_selected['preds'] = model.predict_proba(df_selected[model_columns])[:, 1]
df_selected
<ipython-input-6-e87969287865>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_selected['preds'] = model.predict_proba(df_selected[model_columns])[:, 1]
| Pclass | Sex | Age | SibSp | Parch | Fare | preds | |
|---|---|---|---|---|---|---|---|
| 60 | 3 | 0 | 22.0 | 0 | 0 | 7.2292 | 0.248554 |
| 250 | 3 | 0 | NaN | 0 | 0 | 7.2500 | 0.021391 |
| 512 | 1 | 0 | 36.0 | 0 | 0 | 26.2875 | 0.817681 |
| 300 | 3 | 1 | NaN | 0 | 0 | 7.7500 | 0.786915 |
| 100 | 3 | 1 | 28.0 | 0 | 0 | 7.8958 | 0.406560 |
# Reassign instead of inplace drop: mutating a frame that may be a view of
# X_train raises SettingWithCopyWarning; the result is identical.
df_selected = df_selected.drop(columns='preds')
/home/azapala/.local/lib/python3.8/site-packages/pandas/core/frame.py:4308: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
# Materialise X_train as an independent frame before writing imputed values
# back into it: assigning into a slice of df_train raises
# SettingWithCopyWarning and the write may silently not propagate.
X_train = X_train.copy()
X_train[model_columns] = imputer.fit_transform(X_train[model_columns])
<ipython-input-8-7d236800544f>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy X_train[model_columns] = imputer.fit_transform(X_train[model_columns]) /home/azapala/.local/lib/python3.8/site-packages/pandas/core/indexing.py:1738: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self._setitem_single_column(loc, value[:, i].tolist(), pi)
# Positions of the categorical features inside model_columns.
categorical_idx = [model_columns.index(name) for name in categorical_columns]
# LIME surrogate explainer over the imputed training matrix.
explainer = LimeTabularExplainer(
    preprocess.transform(X_train),
    feature_names=model_columns,
    class_names=['died', 'survived'],
    categorical_features=categorical_idx,
    discretize_continuous=False,
    verbose=True,
)
def show_lime_explanation(row, predict_proba):
    """Render an inline LIME explanation for a single observation.

    `row` is a one-row DataFrame; it is run through the same preprocessing
    pipeline as the training data before being explained.
    """
    instance = preprocess.transform(row).ravel()
    explanation = explainer.explain_instance(instance, predict_proba)
    explanation.show_in_notebook(show_table=True)
# Explain each selected passenger with the LightGBM model.
for idx in passanger_index:
    print("Observation number:", idx)
    show_lime_explanation(X_train.iloc[[idx]], lgbm.predict_proba)
Observation number: 60 Intercept 0.7639779264559281 Prediction_local [0.17188702] Right: 0.24855399739001624
Observation number: 250 Intercept 0.7495446284447234 Prediction_local [0.31630334] Right: 0.021391401419326654
Observation number: 512 Intercept 0.6572969934664611 Prediction_local [0.33704343] Right: 0.8176810799995181
Observation number: 300 Intercept 0.4043681005086558 Prediction_local [0.67869542] Right: 0.7869153888053276
Observation number: 100 Intercept 0.3883081868326946 Prediction_local [0.53834482] Right: 0.4065603563396111
Dla obserwacji 250 i 512 predykcje lokalne znacznie odbiegają od modelu lgbm, wyjaśnienia dla tych obserwacji mają niską wiarygodność. We wszystkich obserwacjach płeć ma największe znaczenie (żeńska dodatnią atrybucję, męska ujemną atrybucję). Można zauważyć, że intercept dla obserwacji o płci żeńskiej jest niższy niż dla płci męskiej. Może wynikać to z faktu odwrotnego kodowania płci w zależności od rozważanej obserwacji, tj. w przypadku mężczyzn mamy cechę one-hot-encoding "Sex=0" o dużej wadze ujemnej, a dla kobiet "Sex=1" o dużej wadze dodatniej. Istotności zmiennych są zbliżone we wszystkich analizach (wagi zmiennych kategorycznych rozważamy wraz z wartościami). Stąd możemy stwierdzić, że nasze wyjaśnienia są stabilne.
# A small feed-forward network as an alternative model for the comparison below.
net = MLPClassifier(hidden_layer_sizes=(60,50), max_iter=500)
# Reuse the same imputation pipeline so both models see identical inputs.
model_net = make_pipeline(preprocess, net)
model_net.fit(X_train, y_train)
Pipeline(steps=[('pipeline',
Pipeline(steps=[('simpleimputer',
SimpleImputer(fill_value=0,
strategy='constant'))])),
('mlpclassifier',
MLPClassifier(hidden_layer_sizes=(60, 50), max_iter=500))])
# Same AUC report as for the LGBM model, now for the neural network.
train_preds = model_net.predict_proba(X_train)[:, 1]
test_preds = model_net.predict_proba(X_test)[:, 1]
for label, truth, preds in [("train", y_train, train_preds),
                            ("test", y_test, test_preds)]:
    print(f"Roc auc {label}:", roc_auc_score(truth, preds))
Roc auc train: 0.8720493651162019 Roc auc test: 0.8858695652173912
# Side-by-side LIME explanations of both models on one observation.
i = 88
print("Numer obserwacji:", i)
for header, proba_fn in (("Sieć neuronowa:", net.predict_proba),
                         ("Model drzewiasty:", lgbm.predict_proba)):
    print(header)
    show_lime_explanation(X_train.iloc[[i]], proba_fn)
Numer obserwacji: 88 Sieć neuronowa: Intercept 0.3267707371136239 Prediction_local [0.88496552] Right: 0.8508629705614504
Model drzewiasty: Intercept 0.3263938642033223 Prediction_local [0.74899012] Right: 0.7899359109119046
Dla obserwacji nr 88 cecha Fare w przypadku sieci neuronowej ma wagę 0.08, natomiast dla lgbm 0.01. Cecha Pclass=1 dla sieci ma niewielką wagę dodatnią, natomiast dla modelu drzewiastego sporą wartość dodatnią. Ta zależność może wynikać z tego, że te dwie cechy są komplementarne/silnie zależne. Zmienna wiek ma atrybucje o różnych znakach w tych dwóch wyjaśnieniach. Współczynniki wolne w obu wyjaśnieniach mają podobny poziom.
# Is a fare above 100 a sufficient condition for travelling first class?
high_fare = X_train['Fare'] > 100
first_class = X_train['Pclass'] == 1
pd.crosstab(high_fare, first_class)
| Pclass | False | True |
|---|---|---|
| Fare | ||
| False | 537 | 130 |
| True | 0 | 45 |
Osoby płacące więcej niż 100 za bilet zawsze zasiadają w pierwszej klasie.